package cn.ac.iscas.webpage.extraction;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import cn.as.iscas.util.Tokenizer;

public class KeywordsExtractor {
	/**
	 * Computes raw term frequencies for the visible text of an HTML page.
	 *
	 * <p>The HTML is parsed with Jsoup, markup is stripped via {@code Document.text()},
	 * and the remaining text is tokenized by {@link Tokenizer#segWords}. Each token is
	 * counted; counts are stored as {@code Double} to match downstream consumers that
	 * expect weighted scores.
	 *
	 * @param html the raw HTML of a web page; may be {@code null} or empty
	 * @return a map from keyword to its occurrence count; empty (never {@code null})
	 *         when the input is blank or extraction fails
	 */
	public Map<String, Double> getKeyWords(String html) {
		Map<String, Double> keywordsMap = new HashMap<String, Double>();
		// Guard: avoid feeding null into Jsoup and taking the exception path.
		if (html == null || html.isEmpty()) {
			return keywordsMap;
		}
		try {
			Tokenizer tokenizer = new Tokenizer();
			// Strip all tags; text() returns the document's combined visible text.
			String content = Jsoup.parse(html).text();
			List<String> tokens = tokenizer.segWords(content);
			for (String token : tokens) {
				// merge() replaces the containsKey/get/put pattern with a single call.
				keywordsMap.merge(token, 1.0, Double::sum);
			}
		} catch (Exception e) {
			// NOTE(review): best-effort by design — on any parse/tokenize failure we
			// return whatever was counted so far. Consider a real logger instead of
			// printStackTrace, and narrowing the caught exception type.
			e.printStackTrace();
		}
		return keywordsMap;
	}
}
